Here, we build random forest models of vote shares in the Swedish general election of 2014. The input data consist of (i) vote percentages for the 290 municipalities in Sweden, (ii) a dataset with various indicators for these municipalities, such as unemployment rate, urban/rural and so on, and (iii) the number of asylum seekers per capita for municipalities in 2014.
In this example, we will demonstrate LIME's tabular mode for when you have "normal" tabular data and not e.g. text or image data. We will show LIME's "tabular explainer" both in regression and in classification mode.
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
import sklearn.datasets
import sklearn.ensemble
import sklearn.metrics  # confusion_matrix is called via sklearn.metrics below
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import lime
import lime.lime_tabular

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode()

import mpld3 # If plotly is down
# Per-municipality vote percentages (tab-separated; municipality code as index).
vote_data = pd.read_csv("municipality_votes_2014.tsv", delimiter="\t", index_col=0)
columns_to_use = ["PROCENT_M","PROCENT_C","PROCENT_FP","PROCENT_KD","PROCENT_S","PROCENT_V","PROCENT_MP","PROCENT_SD","PROCENT_FI"]
# .copy() so that renaming the columns below mutates an independent frame
# rather than a view of vote_data (avoids pandas' SettingWithCopyWarning).
vote_perc = vote_data[columns_to_use].copy()
# Strip the "PROCENT_" prefix, leaving only the party abbreviation (M, C, FP, ...).
vote_perc.columns = [x.split('_')[1] for x in vote_perc.columns]
After some light pre-processing, we get a matrix with the vote share for the nine largest parties per municipality.
vote_perc.head()
Now we join this table with the municipality descriptors and asylum numbers.
# Municipality descriptors (unemployment, urban/rural, ...) and the 2014
# asylum-seeker numbers.
municipality_desc = pd.read_excel("kommundata.xlsx")

# The asylum sheet carries two junk header rows; keep only the municipality
# name (column 1) and the per-capita count (column 4).
asylum_seekers = pd.read_excel("asylsdochm.xlsx").iloc[2:, [1, 4]]
asylum_seekers.columns = ["Municipality", "AsylumSeekers"]
asylum_seekers["AsylumSeekers"] = asylum_seekers["AsylumSeekers"].astype(float)

# Join descriptors with asylum numbers on the municipality name.
muni = municipality_desc.merge(asylum_seekers, left_on="name", right_on="Municipality")
muni.head()
muni.index = muni.name
Great, now we have a table which contains the information we need. For convenience, we also make two lookup tables that can find the municipality code from the municipality name, and vice versa.
# Lookup tables: municipality name -> code and code -> name.
# Built from the named columns rather than positional iloc indexing, so the
# mappings do not silently break if the column order of `muni` ever changes,
# and without a slow per-row pandas scalar-access loop.
code_from_name = dict(zip(muni["name"], muni["code"]))
name_from_code = dict(zip(muni["code"], muni["name"]))
Next, we drop some of the features that are either redundant or that we don't think are interesting.
# Columns that are redundant (code/name duplicate the index and merge key)
# or that we chose not to model (2010 figures, satisfaction scores).
features_to_drop = ['code', 'name', 'youthUnemployment2010', 'unemployment2010', 'satisfactionInfluence','satisfactionGeneral', 'satisfactionElderlyCare', 'Municipality']
muni = muni.drop(columns=features_to_drop)
muni.head()
As our final pre-processing steps, we convert categorical variables to dummy variables and create a couple of binary classification variables for whether SD or C achieved more than 15% of the votes in each municipality.
# One-hot encode the categorical descriptors to get a purely numeric matrix.
X = pd.get_dummies(muni)
# Binary classification targets: SD above 15% / C above 10% of the votes.
sd_over_15 = vote_perc['SD'] > 15.0
c_over_10 = vote_perc['C'] > 10
Let's try to model the vote share of MP, for instance.
# Regress MP's vote share on the municipality features, holding out a third
# of the municipalities for testing.
X_train, X_test, y_train, y_test = train_test_split(X, vote_perc.MP, test_size=0.33, random_state=42)
# fit() returns the estimator itself, so build and fit in one expression.
rf = RandomForestRegressor(max_depth=2, random_state=0).fit(X_train, y_train)
How are we doing on the test set?
# Scatter the model's test-set predictions against the actual MP vote shares;
# hovering a point shows the municipality name.
trace = go.Scatter(
    x=rf.predict(X_test),
    y=y_test,
    mode='markers',
    text=X_test.index
)
layout = go.Layout(
    title='MP vote share, actual vs predicted',
    xaxis=dict(
        title='Predicted'
    ),
    yaxis=dict(
        title='Actual'
    )
)
data = [trace]
# Plot and embed in ipython notebook!
fig = go.Figure(data=data, layout=layout)
# Render offline inside the notebook. `py.iplot` (plotly.plotly) tries to
# upload the figure to the Plotly cloud service and fails without account
# credentials; the offline `iplot` imported at the top needs no account.
iplot(fig, filename='axss-labels')
# Backup cell: interactive scatter via matplotlib + mpld3, in case plotly is down.
from matplotlib import pyplot as plt
# `axisbg` was removed in matplotlib 2.0; `facecolor` is the current keyword.
fig, ax = plt.subplots(subplot_kw=dict(facecolor='#EEEEEE'))
scatter = ax.scatter(rf.predict(X_test), y_test)
ax.grid(color='white', linestyle='solid')
ax.set_title("Actual vs. predicted", size=20)
# Tooltip shows the municipality name when hovering over a point.
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=list(X_test.index.values))
mpld3.plugins.connect(fig, tooltip)
mpld3.display()
Let's try to explain a prediction from above! Note that the input needs to be a (2D) NumPy array, and not, for example, a Pandas data frame.
# Explain the regression prediction for Täby: locate its row position in the
# test matrix, fit a local LIME surrogate, and show the top three features.
ix = np.where(X_test.index == 'Täby')[0].tolist()[0]
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    mode='regression',
    feature_names=X.columns.tolist(),
    training_labels=y_train,
)
exp = explainer.explain_instance(X_test.values[ix], rf.predict, num_features=3, top_labels=1)
exp.show_in_notebook(show_table=True, show_all=True)
We can try to build a classifier that predicts whether some party will get at least a certain percentage, e.g. does SD get more than 15% or C more than 10%?
# Classification: does SD exceed 15% of the votes in a municipality?
X_train, X_test, y_train, y_test = train_test_split(X, sd_over_15, test_size=0.33, random_state=42)
# Fixed seed so the confusion matrix below is reproducible between runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
# Requires `import sklearn.metrics` in the import cell; `import sklearn`
# alone does not guarantee the submodule is loaded. Rows = actual, columns = predicted.
sklearn.metrics.confusion_matrix(y_pred=rf.predict(X_test), y_true=y_test)
# Baseline: fraction of positives in the test set.
#len(np.where(y_test==True)[0]) / len(y_test)
Let's look at false positives, where the algorithm thought SD would get >15% but it didn't. Why did it think so?
# Side-by-side view of predicted vs actual outcomes per municipality,
# filtered down to the disagreements.
df = pd.DataFrame(
    {
        'predicted': rf.predict(X_test),
        'actual': y_test,
        'town': X_test.index,
    }
)
mismatch = df['actual'] != df['predicted']
df[mismatch]
For example, Degerfors. What percentage did it have in fact?
# Actual 2014 vote shares for municipality code 1862 — presumably Degerfors,
# per the note above; verify against the code/name lookup tables.
vote_perc.loc[1862]
# Locate Degerfors' row position within the test matrix.
ix = np.where(X_test.index == 'Degerfors')[0].tolist()[0]
# training_labels must align row-for-row with the training data, so pass
# y_train (the train split) rather than the full 290-row sd_over_15 series.
explainer = lime.lime_tabular.LimeTabularExplainer(np.array(X_train), feature_names=X.columns.tolist(), training_labels=y_train)
exp = explainer.explain_instance(np.array(X_test)[ix,:], rf.predict_proba, num_features=3, top_labels=1)
exp.show_in_notebook(show_table=True, show_all=False)
We could also try to predict which party will get the most votes. Check which parties actually received the most votes in any municipality.
# Multi-class target: the party with the largest vote share per municipality.
largest = vote_perc.idxmax(axis=1)
largest.value_counts()
X_train, X_test, y_train, y_test = train_test_split(X, largest, test_size=0.33, random_state=42)
# Fixed seed so the confusion matrix below is reproducible between runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
# Requires `import sklearn.metrics` in the import cell. Label order in the
# matrix is the sorted unique labels (same ordering as rf.classes_).
sklearn.metrics.confusion_matrix(y_pred=rf.predict(X_test), y_true=y_test)
We could look at, for example, where the random forest predicted S as the largest party but M actually won.
# Prediction/actual table for the multi-class model, indexed by municipality.
df = pd.DataFrame(
    {
        'predicted': rf.predict(X_test),
        'actual': y_test,
        'town': X_test.index,
    }
)
# Municipalities where M actually won but the model predicted S.
df[(df['actual'] == 'M') & (df['predicted'] == 'S')]
# Locate Ängelholm's row position within the test matrix.
ix = np.where(X_test.index == 'Ängelholm')[0].tolist()[0]
# Two fixes over the naive call:
#  * training_labels must align row-for-row with X_train -> pass y_train,
#    not the full `largest` series.
#  * class_names must follow the column order of rf.predict_proba, which is
#    rf.classes_, rather than a hard-coded ['M','S','SD'] guess.
explainer = lime.lime_tabular.LimeTabularExplainer(np.array(X_train), feature_names=X.columns.tolist(), training_labels=y_train, class_names=list(rf.classes_))
exp = explainer.explain_instance(np.array(X_test)[ix,:], rf.predict_proba, num_features=3, top_labels=2)
exp.show_in_notebook(show_table=True, show_all=False)
Or the converse (predicted M, but S was the largest.)
# Municipalities where S actually won but the model predicted M.
df[(df['actual'] == 'S') & (df['predicted'] == 'M')]
# Locate Kungälv's row position within the test matrix.
ix = np.where(X_test.index == 'Kungälv')[0].tolist()[0]
# training_labels must align row-for-row with X_train (use y_train, not the
# full `largest` series), and class_names must follow rf.predict_proba's
# column order (rf.classes_) instead of a hard-coded ['M','S','SD'] guess.
explainer = lime.lime_tabular.LimeTabularExplainer(np.array(X_train), feature_names=X.columns.tolist(), training_labels=y_train, class_names=list(rf.classes_))
exp = explainer.explain_instance(np.array(X_test)[ix,:], rf.predict_proba, num_features=3, top_labels=2)
exp.show_in_notebook(show_table=True, show_all=True)